# 无法在程序内搜索小说然后下载，因为无法通过网址参数传递进行搜索。以后可以通过抓包再次尝试
# 保存到 txt 文件不便阅读，保存到 word 文件段落首行缩进过多，而且 word 文档在屏幕上展示的内容面积过小，最佳的文件格式是什么？
# 网页内有一些其他内容，尝试能否净化掉

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def req(url, timeout=10):
    """Fetch *url* and return its HTML parsed into a BeautifulSoup tree.

    The response body is decoded as GBK and parsed with the ``lxml`` parser.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without it a stalled connection
            would block the whole crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the decoded response text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    headersvalue = {
        # Desktop browser User-Agent sent with every request.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.50',
    }
    r = requests.get(url, headers=headersvalue, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as content.
    r.raise_for_status()
    # The encoding is forced to GBK rather than taken from the response.
    r.encoding = 'gbk'
    return BeautifulSoup(r.text, 'lxml')

def get_content(book_url, output_dir='d:/'):
    """Download every chapter of a novel and append them to one text file.

    The book's index page is fetched with ``req``; the book title is read
    from ``<span class="novelname">`` and the chapter links from the second
    ``<div class="listchapter">``. Each chapter page is then fetched and the
    text of its ``<div class="chapter_content">`` is appended to
    ``<output_dir>/<book title>.txt`` (UTF-8).

    Args:
        book_url: URL of the book's index page. Chapter hrefs are resolved
            relative to it.
        output_dir: Directory the text file is written to. Defaults to the
            original hard-coded location ``d:/``.

    Raises:
        AttributeError: If the title span or a chapter's content div is not
            found (``find`` returns ``None`` in that case).
        IndexError: If the index page has fewer than two ``listchapter`` divs.
    """
    soup_book = req(book_url)
    book_name = soup_book.find('span', attrs={'class': 'novelname'}).text
    # Only the second 'listchapter' block is crawled.
    # NOTE(review): presumably the first one is a "latest chapters" list - confirm.
    node_a = soup_book.find_all('div', attrs={'class': 'listchapter'})[1].find_all('a')

    # The title is scraped text; characters that are illegal in file names
    # (notably on Windows) would make open() fail, so replace them.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', book_name)
    out_path = os.path.join(output_dir, f'{safe_name}.txt')

    for a in node_a:
        chapter_name = a.text
        # urljoin handles relative, root-relative and absolute hrefs alike,
        # whereas plain concatenation only works for bare relative names.
        chapter_url = urljoin(book_url, a.attrs['href'])
        soup_chapter = req(chapter_url)
        # Original author's note: unlike the approach tried with the lxml
        # library, `.text` here yields the chapter's full content and takes
        # care of the two <br> tags between paragraphs.
        content = soup_chapter.find('div', attrs={'class': 'chapter_content'}).text
        # Re-open in append mode per chapter so every finished chapter is on
        # disk even if a later request fails.
        with open(out_path, 'a', encoding='utf-8') as f:
            # Built line by line so the source indentation does not leak
            # into the saved text.
            f.write(
                '\n'
                '----------------\n'
                f'{chapter_name}\n'
                '----------------\n'
                '\n'
                f'{content}\n'
                '\n'
                '\n'
                '====================================================================\n'
            )
        print(f'{chapter_name} 爬取成功...')

# Script entry: crawl the book whose index page lives at this URL.
BOOK_URL = 'https://yuejuwx.com/0/16/'
get_content(BOOK_URL)